In [22]:
import contextlib
import shelve

import matplotlib.pyplot as plt
import numpy
import pandas
from sklearn import svm, decomposition, cross_validation, pipeline, manifold, random_projection
from sklearn.feature_extraction.text import TfidfVectorizer

import book_classification as bc

In [2]:
# Load the pickled book collection from the on-disk shelf.
# The shelf is opened read-only (this cell never writes, and a missing file
# should fail loudly instead of silently creating an empty database) and is
# closed deterministically as soon as the collection has been read.
# contextlib.closing is used because Shelf objects are not context managers
# on every Python 3 release.
with contextlib.closing(shelve.open("storage_new.db", flag='r')) as book_shelf:
    aBookCollection = book_shelf['aBookCollection']
print(len(aBookCollection))


597

In [3]:
# Filter the collection and split it into train/test inside a seeded
# RandomContext so the split can be reproduced.
# NOTE(review): judging by the method names, exclude_authors_below(7) drops
# authors with fewer than 7 books and split_per_author_percentage(0.7) sends
# 70% of each author's books to the first collection -- confirm against
# book_classification.
with bc.RandomContext(123):
    anotherCollection = aBookCollection.selection().exclude_authors_below(7)
    print(len(anotherCollection))
    split_selection = anotherCollection.selection()
    train_collection, test_collection = split_selection.split_per_author_percentage(0.7)


508

Try entropies


In [6]:
# Cross-validate an "entropies features -> truncated SVD -> SVM" pipeline on
# the training collection (stratified 4-fold, scored by accuracy).
tokenizer = bc.BasicTokenizer()
grouper = bc.FixedGrouper(500)
extractor = bc.EntropiesExtractor(tokenizer, grouper)
matrix_extractor = bc.SklExtractor(extractor)
predictor = pipeline.Pipeline([
    ('extractor', matrix_extractor),
    # TruncatedSVD uses a randomized solver; pin its seed so that re-running
    # the cell reproduces the same scores.
    ('svd', decomposition.TruncatedSVD(50, random_state=123)),
    ('svm', svm.SVC(class_weight='auto')),
])
books, authors = train_collection.as_arrays()
scores = cross_validation.cross_val_score(
    predictor, books, authors,
    scoring='accuracy',
    cv=cross_validation.StratifiedKFold(authors, n_folds=4))

In [7]:
# Display the most recently computed cross-validation scores (one per fold).
scores


Out[7]:
array([ 0.62222222,  0.68539326,  0.66292135,  0.70786517])

Visualize entropies


In [5]:
# Rebuild the entropies extractor on its own (outside a Pipeline) so the
# feature matrix can be computed and inspected directly in the next cells.
books, authors = train_collection.as_arrays()
tokenizer, grouper = bc.BasicTokenizer(), bc.FixedGrouper(500)
extractor = bc.EntropiesExtractor(tokenizer, grouper)
matrix_extractor = bc.SklExtractor(extractor)

In [6]:
# Fit the extractor on the training books, then turn those same books into
# the feature matrix used by the decompositions below.
matrix_extractor.fit(books)
books_matrix = matrix_extractor.transform(books)

# Encode every author with the indexer; the encoded values are later passed
# as the `c=` (colour) argument of the scatter plots.
authors_indexer = bc.NumericIndexer(authors)
authors_matrix = list(map(authors_indexer.encode, authors))

In [7]:
# Factor analysis, independent component analysis, non-negative matrix
# factorization, principal component analysis, etc. can't be applied directly
# because the feature matrix is sparse, so reduce it with TruncatedSVD first.
#
# The randomized solver is seeded so that the reduced matrix (and every
# embedding computed from it) is the same on each run.
svd = decomposition.TruncatedSVD(n_components=100, n_iterations=10, random_state=123)
svd_matrix = svd.fit_transform(books_matrix)

In [190]:
# Bar plot of the first 30 singular values of the truncated SVD.
#
# TruncatedSVD.fit_transform returns U * diag(Sigma), where U has orthonormal
# columns, so the Euclidean norm of each column of svd_matrix is the
# corresponding singular value. Deriving Sigma this way avoids calling the
# private svd._fit(), which would redo the whole (randomized) factorization.
Sigma = numpy.sqrt((svd_matrix ** 2).sum(axis=0))
pandas.Series(Sigma)[:30].plot(kind='bar', figsize=(10, 8))


Out[190]:
<matplotlib.axes.AxesSubplot at 0x7f0128d87f10>

In [184]:
# Scatter-plot marker size for every training book: 150 * (1 - share)**10,
# where `share` is the fraction of the training collection written by the
# book's author. Books by heavily represented authors get smaller markers.
# NOTE(review): sizes follow the order of train_collection.books(); the plots
# assume this matches the row order of the matrices built from
# train_collection.as_arrays() -- confirm.
total_books = len(train_collection)  # loop-invariant, computed once
books_sizes = [
    150 * (1 - (len(train_collection.books_by(book.author())) / total_books)) ** 10
    for book in train_collection.books()
]

In [185]:
# 2-D locally linear embedding of the SVD-reduced books, coloured by author
# and sized by books_sizes.
# (Trying different values for the method=... argument doesn't help much.)
#
# The estimator that is fitted and the matrix that is plotted are the LLE
# ones defined here, not `isomap` / `isomap_matrix` left over in the kernel
# from an earlier experiment. Plotting goes through the explicit `plt`
# namespace so the cell does not depend on %pylab.
lle = manifold.LocallyLinearEmbedding(n_neighbors=40, n_components=2)
lle_matrix = lle.fit_transform(svd_matrix)
plt.figure(figsize=(12, 12))
plt.scatter(lle_matrix[:, 0], lle_matrix[:, 1], c=authors_matrix, s=books_sizes)


Out[185]:
<matplotlib.collections.PathCollection at 0x7f01116ca7d0>

In [11]:
# 2-D PCA projection of the SVD-reduced books, coloured by author.
# The result looks similar to manifold.Isomap.
pca = decomposition.PCA(n_components=2)
pca_matrix = pca.fit_transform(svd_matrix)
# Use the explicit pyplot API: `plt.figsize` and a bare `scatter` only exist
# when the kernel was started in %pylab mode.
plt.figure(figsize=(8, 8))
plt.scatter(pca_matrix[:, 0], pca_matrix[:, 1], c=authors_matrix)


Out[11]:
<matplotlib.collections.PathCollection at 0x7f0110642f90>

In [20]:
# 2-D multidimensional scaling of the SVD-reduced books, coloured by author.
# MDS starts from a random configuration, so the seed is pinned to make the
# figure reproducible.
mds = manifold.MDS(n_components=2, random_state=123)
mds_matrix = mds.fit_transform(svd_matrix)
# Use the explicit pyplot API: `plt.figsize` and a bare `scatter` only exist
# when the kernel was started in %pylab mode.
plt.figure(figsize=(8, 8))
plt.scatter(mds_matrix[:, 0], mds_matrix[:, 1], c=authors_matrix)


Out[20]:
<matplotlib.collections.PathCollection at 0x7f01106a92d0>

In [21]:
# 2-D spectral embedding of the SVD-reduced books, coloured by author.
# The eigen-solver is initialised randomly, so the seed is pinned to make the
# figure reproducible.
spectral = manifold.SpectralEmbedding(n_components=2, random_state=123)
spectral_matrix = spectral.fit_transform(svd_matrix)
# Use the explicit pyplot API: `plt.figsize` and a bare `scatter` only exist
# when the kernel was started in %pylab mode.
plt.figure(figsize=(8, 8))
plt.scatter(spectral_matrix[:, 0], spectral_matrix[:, 1], c=authors_matrix)


Out[21]:
<matplotlib.collections.PathCollection at 0x7f0121fbabd0>

Try frequencies


In [7]:
# Cross-validate a "token frequencies -> truncated SVD -> SVM" pipeline on the
# training collection (stratified 4-fold, scored by accuracy).
# FrequenciesExtractor only needs a tokenizer, so no grouper is built here.
tokenizer = bc.BasicTokenizer()
extractor = bc.FrequenciesExtractor(tokenizer)
matrix_extractor = bc.SklExtractor(extractor)
predictor = pipeline.Pipeline([
    ('extractor', matrix_extractor),
    # TruncatedSVD uses a randomized solver; pin its seed so that re-running
    # the cell reproduces the same scores.
    ('svd', decomposition.TruncatedSVD(50, random_state=123)),
    ('svm', svm.SVC(class_weight='auto')),
])
books, authors = train_collection.as_arrays()
scores = cross_validation.cross_val_score(
    predictor, books, authors,
    scoring='accuracy',
    cv=cross_validation.StratifiedKFold(authors, n_folds=4))

In [8]:
# Display the most recently computed cross-validation scores (one per fold).
scores


Out[8]:
array([ 0.11111111,  0.04494382,  0.13483146,  0.04494382])

Try TF-IDF


In [8]:
# Cross-validate a "TF-IDF -> truncated SVD -> SVM" pipeline on the raw book
# texts (stratified 4-fold, scored by accuracy).
# TfidfVectorizer is imported in the notebook's top import cell.
predictor = pipeline.Pipeline([
    ('extractor', TfidfVectorizer()),
    # TruncatedSVD uses a randomized solver; pin its seed so that re-running
    # the cell reproduces the same scores.
    ('svd', decomposition.TruncatedSVD(50, random_state=123)),
    ('svm', svm.SVC(class_weight='auto')),
])
books, authors = train_collection.as_arrays()
# TfidfVectorizer works on text, so feed it each book's contents rather than
# the book objects themselves.
texts = [b.contents() for b in books]
scores = cross_validation.cross_val_score(
    predictor, texts, authors,
    scoring='accuracy',
    cv=cross_validation.StratifiedKFold(authors, n_folds=4))


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-8-0596a5e266b5> in <module>()
      7 books, authors = train_collection.as_arrays()
      8 scores = cross_validation.cross_val_score(predictor, [b.contents() for b in books], authors,
----> 9 	scoring='accuracy', cv=cross_validation.StratifiedKFold(authors, n_folds=4))

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/cross_validation.py in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, score_func, pre_dispatch)
   1150         delayed(_cross_val_score)(clone(estimator), X, y, scorer, train, test,
   1151                                   verbose, fit_params)
-> 1152         for train, test in cv)
   1153     return np.array(scores)
   1154 

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    515         try:
    516             for function, args, kwargs in iterable:
--> 517                 self.dispatch(function, args, kwargs)
    518 
    519             self.retrieve()

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in dispatch(self, func, args, kwargs)
    310         """
    311         if self._pool is None:
--> 312             job = ImmediateApply(func, args, kwargs)
    313             index = len(self._jobs)
    314             if not _verbosity_filter(index, self.verbose):

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, func, args, kwargs)
    134         # Don't delay the application, to avoid keeping the input
    135         # arguments in memory
--> 136         self.results = func(*args, **kwargs)
    137 
    138     def get(self):

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/cross_validation.py in _cross_val_score(estimator, X, y, scorer, train, test, verbose, fit_params)
   1058         y_train = y[train]
   1059         y_test = y[test]
-> 1060     estimator.fit(X_train, y_train, **fit_params)
   1061     if scorer is None:
   1062         score = estimator.score(X_test, y_test)

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    128         data, then fit the transformed data using the final estimator.
    129         """
--> 130         Xt, fit_params = self._pre_transform(X, y, **fit_params)
    131         self.steps[-1][-1].fit(Xt, y, **fit_params)
    132         return self

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/pipeline.py in _pre_transform(self, X, y, **fit_params)
    118         for name, transform in self.steps[:-1]:
    119             if hasattr(transform, "fit_transform"):
--> 120                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    121             else:
    122                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/decomposition/truncated_svd.py in fit_transform(self, X, y)
    114             Reduced version of X. This will always be a dense array.
    115         """
--> 116         U, Sigma, VT = self._fit(X)
    117         Sigma = np.diag(Sigma)
    118 

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/decomposition/truncated_svd.py in _fit(self, X)
    139             U, Sigma, VT = randomized_svd(X, self.n_components,
    140                                           n_iter=self.n_iterations,
--> 141                                           random_state=random_state)
    142         else:
    143             raise ValueError("unknown algorithm %r" % self.algorithm)

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/utils/extmath.py in randomized_svd(M, n_components, n_oversamples, n_iter, transpose, flip_sign, random_state, n_iterations)
    205         M = M.T
    206 
--> 207     Q = randomized_range_finder(M, n_random, n_iter, random_state)
    208 
    209     # project M to the (k + p) dimensional space using the basis vectors

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/utils/extmath.py in randomized_range_finder(A, size, n_iter, random_state, n_iterations)
    129     # singular vectors of A in Y
    130     for i in xrange(n_iter):
--> 131         Y = safe_sparse_dot(A, safe_sparse_dot(A.T, Y))
    132 
    133     # extracting an orthonormal basis of the A range samples

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/utils/extmath.py in safe_sparse_dot(a, b, dense_output)
     76     from scipy import sparse
     77     if sparse.issparse(a) or sparse.issparse(b):
---> 78         ret = a * b
     79         if dense_output and hasattr(ret, "toarray"):
     80             ret = ret.toarray()

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/scipy/sparse/base.py in __mul__(self, other)
    252                 return self._mul_vector(other.ravel()).reshape(M, 1)
    253             elif other.ndim == 2  and other.shape[0] == N:
--> 254                 return self._mul_multivector(other)
    255 
    256         if isscalarlike(other):

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/scipy/sparse/compressed.py in _mul_multivector(self, other)
    279         # csr_matvecs or csc_matvecs
    280         fn = getattr(sparsetools,self.format + '_matvecs')
--> 281         fn(M, N, n_vecs, self.indptr, self.indices, self.data, other.ravel(), result.ravel())
    282 
    283         return result

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/scipy/sparse/sparsetools/csc.py in csc_matvecs(*args)
    251         npy_clongdouble_wrapper Yx)
    252     """
--> 253   return _csc.csc_matvecs(*args)
    254 
    255 def csc_elmul_csc(*args):

KeyboardInterrupt: 

In [12]:
# NOTE(review): the TF-IDF cross-validation recorded above ended in a
# KeyboardInterrupt before `scores` could be reassigned, so the value shown
# here is left over from an earlier run -- presumably the frequencies
# pipeline -- and is NOT a TF-IDF result.
scores


Out[12]:
array([ 0.11111111,  0.04494382,  0.14606742,  0.04494382])

In [ ]: